This is my first data science and machine leearning project. i am using the titanic dataset from kaggle.¶

I want to do the titanic survival prediction project¶

so Help me God¶

In [1]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
In [2]:
#loading the dataset
df=pd.read_csv('Titanic.csv')
In [3]:
#displaying the first five columns of the data
df.head()
Out[3]:
sex age sibsp parch fare embarked class who alone survived
0 male 22.0 1 0 7.2500 S Third man False 0
1 female 38.0 1 0 71.2833 C First woman False 1
2 female 26.0 0 0 7.9250 S Third woman True 1
3 female 35.0 1 0 53.1000 S First woman False 1
4 male 35.0 0 0 8.0500 S Third man True 0
In [4]:
#displaying the last five columns of the data
df.tail()
Out[4]:
sex age sibsp parch fare embarked class who alone survived
886 male 27.0 0 0 13.00 S Second man True 0
887 female 19.0 0 0 30.00 S First woman True 1
888 female NaN 1 2 23.45 S Third woman False 0
889 male 26.0 0 0 30.00 C First man True 1
890 male 32.0 0 0 7.75 Q Third man True 0
In [5]:
#getting the no of rows in our data
len(df)
Out[5]:
891
In [6]:
#getting the columns in our data
df.columns
Out[6]:
Index(['sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who',
       'alone', 'survived'],
      dtype='object')

diplaying the number of values in each sample class

In [7]:
df['age'].value_counts()
Out[7]:
age
24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: count, Length: 88, dtype: int64
In [8]:
df['fare'].value_counts()
Out[8]:
fare
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
35.0000     1
28.5000     1
6.2375      1
14.0000     1
10.5167     1
Name: count, Length: 248, dtype: int64
In [9]:
df['who'].value_counts()
Out[9]:
who
man      537
woman    271
child     83
Name: count, dtype: int64
In [10]:
df['alone'].value_counts()
Out[10]:
alone
True     537
False    354
Name: count, dtype: int64
In [11]:
df['class'].value_counts()
Out[11]:
class
Third     491
First     216
Second    184
Name: count, dtype: int64
In [12]:
df['alone'].value_counts()
Out[12]:
alone
True     537
False    354
Name: count, dtype: int64
In [13]:
df.describe()
Out[13]:
age sibsp parch fare survived
count 714.000000 891.000000 891.000000 891.000000 891.000000
mean 29.699118 0.523008 0.381594 32.204208 0.383838
std 14.526497 1.102743 0.806057 49.693429 0.486592
min 0.420000 0.000000 0.000000 0.000000 0.000000
25% 20.125000 0.000000 0.000000 7.910400 0.000000
50% 28.000000 0.000000 0.000000 14.454200 0.000000
75% 38.000000 1.000000 0.000000 31.000000 1.000000
max 80.000000 8.000000 6.000000 512.329200 1.000000
In [14]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sex       891 non-null    object 
 1   age       714 non-null    float64
 2   sibsp     891 non-null    int64  
 3   parch     891 non-null    int64  
 4   fare      891 non-null    float64
 5   embarked  889 non-null    object 
 6   class     891 non-null    object 
 7   who       891 non-null    object 
 8   alone     891 non-null    bool   
 9   survived  891 non-null    int64  
dtypes: bool(1), float64(2), int64(3), object(4)
memory usage: 63.6+ KB
In [15]:
df.isnull().sum()
Out[15]:
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
class         0
who           0
alone         0
survived      0
dtype: int64

VISUALIZATION 1.HISTOGRAMS

In [16]:
df['age'].hist()
Out[16]:
<Axes: >
No description has been provided for this image
In [17]:
df['who'].hist()
Out[17]:
<Axes: >
No description has been provided for this image
In [18]:
df['fare'].hist()
Out[18]:
<Axes: >
No description has been provided for this image
In [19]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
In [20]:
df.shape
Out[20]:
(891, 10)
In [21]:
df['sex']=le.fit_transform(df['sex'])
df.head()
Out[21]:
sex age sibsp parch fare embarked class who alone survived
0 1 22.0 1 0 7.2500 S Third man False 0
1 0 38.0 1 0 71.2833 C First woman False 1
2 0 26.0 0 0 7.9250 S Third woman True 1
3 0 35.0 1 0 53.1000 S First woman False 1
4 1 35.0 0 0 8.0500 S Third man True 0
In [22]:
df['who']=le.fit_transform(df['who'])
df.head()
df.tail()
Out[22]:
sex age sibsp parch fare embarked class who alone survived
886 1 27.0 0 0 13.00 S Second 1 True 0
887 0 19.0 0 0 30.00 S First 2 True 1
888 0 NaN 1 2 23.45 S Third 2 False 0
889 1 26.0 0 0 30.00 C First 1 True 1
890 1 32.0 0 0 7.75 Q Third 1 True 0
In [23]:
df['alone']=le.fit_transform(df['alone'])
df.head()
Out[23]:
sex age sibsp parch fare embarked class who alone survived
0 1 22.0 1 0 7.2500 S Third 1 0 0
1 0 38.0 1 0 71.2833 C First 2 0 1
2 0 26.0 0 0 7.9250 S Third 2 1 1
3 0 35.0 1 0 53.1000 S First 2 0 1
4 1 35.0 0 0 8.0500 S Third 1 1 0
In [24]:
df['class']=le.fit_transform(df['class'])
df.head()
Out[24]:
sex age sibsp parch fare embarked class who alone survived
0 1 22.0 1 0 7.2500 S 2 1 0 0
1 0 38.0 1 0 71.2833 C 0 2 0 1
2 0 26.0 0 0 7.9250 S 2 2 1 1
3 0 35.0 1 0 53.1000 S 0 2 0 1
4 1 35.0 0 0 8.0500 S 2 1 1 0
In [25]:
df['embarked']=le.fit_transform(df['embarked'])
df.head()
Out[25]:
sex age sibsp parch fare embarked class who alone survived
0 1 22.0 1 0 7.2500 2 2 1 0 0
1 0 38.0 1 0 71.2833 0 0 2 0 1
2 0 26.0 0 0 7.9250 2 2 2 1 1
3 0 35.0 1 0 53.1000 2 0 2 0 1
4 1 35.0 0 0 8.0500 2 2 1 1 0
In [26]:
df.corr()
Out[26]:
sex age sibsp parch fare embarked class who alone survived
sex 1.000000 0.093254 -0.114631 -0.245489 -0.182333 0.104057 0.131900 -0.639773 0.303646 -0.543351
age 0.093254 1.000000 -0.308247 -0.189119 0.096067 -0.025252 -0.369226 0.378685 0.198270 -0.077221
sibsp -0.114631 -0.308247 1.000000 0.414838 0.159651 0.066654 0.083081 -0.136003 -0.584471 -0.035322
parch -0.245489 -0.189119 0.414838 1.000000 0.216225 0.038322 0.018443 -0.055682 -0.583398 0.081629
fare -0.182333 0.096067 0.159651 0.216225 1.000000 -0.221226 -0.549500 0.146290 -0.271832 0.257307
embarked 0.104057 -0.025252 0.066654 0.038322 -0.221226 1.000000 0.157112 -0.060177 0.065610 -0.163517
class 0.131900 -0.369226 0.083081 0.018443 -0.549500 0.157112 1.000000 -0.196793 0.135207 -0.338481
who -0.639773 0.378685 -0.136003 -0.055682 0.146290 -0.060177 -0.196793 1.000000 0.006540 0.325753
alone 0.303646 0.198270 -0.584471 -0.583398 -0.271832 0.065610 0.135207 0.006540 1.000000 -0.203367
survived -0.543351 -0.077221 -0.035322 0.081629 0.257307 -0.163517 -0.338481 0.325753 -0.203367 1.000000
In [27]:
corr=df.corr()
fig, ax= plt.subplots(figsize=(10,10))
sns.heatmap(corr, annot =True, ax=ax,cmap='coolwarm')
Out[27]:
<Axes: >
No description has been provided for this image
In [28]:
#scatterplots
sns.pairplot(df,hue='survived')
plt.show()
No description has been provided for this image
In [29]:
sns.boxplot(y='age', data=df,width=0.2)
Out[29]:
<Axes: ylabel='age'>
No description has been provided for this image
In [30]:
sns.boxplot(y='age',x='sex',hue='survived', data=df,width=0.2)
Out[30]:
<Axes: xlabel='sex', ylabel='age'>
No description has been provided for this image
In [31]:
df.head(10)
Out[31]:
sex age sibsp parch fare embarked class who alone survived
0 1 22.0 1 0 7.2500 2 2 1 0 0
1 0 38.0 1 0 71.2833 0 0 2 0 1
2 0 26.0 0 0 7.9250 2 2 2 1 1
3 0 35.0 1 0 53.1000 2 0 2 0 1
4 1 35.0 0 0 8.0500 2 2 1 1 0
5 1 NaN 0 0 8.4583 1 2 1 1 0
6 1 54.0 0 0 51.8625 2 0 1 1 0
7 1 2.0 3 1 21.0750 2 2 0 0 0
8 0 27.0 0 2 11.1333 2 2 2 0 1
9 0 14.0 1 0 30.0708 0 1 0 0 1
In [33]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[33], line 3
      1 from sklearn.impute import SimpleImputer
      2 imputer = SimpleImputer(strategy='mean')
----> 3 x_train = imputer.fit_transform(x_train)
      4 x_test = imputer.transform(x_test)

NameError: name 'x_train' is not defined

MODEL TRAINING

In [34]:
from sklearn.model_selection import train_test_split
X=df.drop(columns=['survived'])
Y=df['survived']
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=42)
In [35]:
print(y_train.unique())  # If using pandas Series
[0 1]
In [36]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)
In [37]:
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train, y_train)
C:\Users\Win\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Out[37]:
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()

Print metric to get performance

In [38]:
print("Accuracy:",model.score(x_test,y_test)*100)
Accuracy: 79.88826815642457
In [39]:
#K-NEAREST NEIGHBORS
from sklearn.neighbors import KNeighborsClassifier
model=KNeighborsClassifier()
model.fit(x_train,y_train)
Out[39]:
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier()
In [40]:
print("Accuracy:",model.score(x_test,y_test)*100)
Accuracy: 71.50837988826815
In [41]:
#DECISION TREE
from sklearn.tree import DecisionTreeClassifier
model=DecisionTreeClassifier()
model.fit(x_train,y_train)
Out[41]:
DecisionTreeClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
In [42]:
print("Accuracy:",model.score(x_test,y_test)*100)
Accuracy: 79.3296089385475
In [43]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier()
model.fit(x_train,y_train)
Out[43]:
RandomForestClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
In [44]:
print("Accuracy:",model.score(x_test,y_test)*100)
Accuracy: 82.12290502793296
In [45]:
#use a trained model to make predict
print(x_test[:5])  # Show first 5 rows of the NumPy array
[[ 1.         29.49884615  1.          1.         15.2458      0.
   2.          1.          0.        ]
 [ 1.         31.          0.          0.         10.5         2.
   1.          1.          1.        ]
 [ 1.         20.          0.          0.          7.925       2.
   2.          1.          1.        ]
 [ 0.          6.          0.          1.         33.          2.
   1.          0.          0.        ]
 [ 0.         14.          1.          0.         11.2417      0.
   2.          0.          0.        ]]
In [46]:
import pandas as pd

# You need to know the column names — assuming X.columns from the original DataFrame
x_test_df = pd.DataFrame(x_test, columns=X.columns)
x_test_df.head()
Out[46]:
sex age sibsp parch fare embarked class who alone
0 1.0 29.498846 1.0 1.0 15.2458 0.0 2.0 1.0 0.0
1 1.0 31.000000 0.0 0.0 10.5000 2.0 1.0 1.0 1.0
2 1.0 20.000000 0.0 0.0 7.9250 2.0 2.0 1.0 1.0
3 0.0 6.000000 0.0 1.0 33.0000 2.0 1.0 0.0 0.0
4 0.0 14.000000 1.0 0.0 11.2417 0.0 2.0 0.0 0.0
In [47]:
y_test.head()
Out[47]:
709    1
439    0
840    0
720    1
39     1
Name: survived, dtype: int64
In [48]:
clf= RandomForestClassifier()
clf.fit(x_train,y_train)
(clf.score(x_test,y_test)*100)
Out[48]:
82.12290502793296
In [49]:
clf.predict(x_test)
Out[49]:
array([0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1], dtype=int64)
In [50]:
np.array(y_test)
Out[50]:
array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 1], dtype=int64)
In [51]:
#compare the prediction to the truth labels to evaluate the model
y_preds=clf.predict(x_test)
(np.mean(y_preds == y_test)*100)
Out[51]:
82.12290502793296
In [52]:
pip install plotly
Requirement already satisfied: plotly in c:\users\win\appdata\local\programs\python\python312\lib\site-packages (6.1.2)
Requirement already satisfied: narwhals>=1.15.1 in c:\users\win\appdata\local\programs\python\python312\lib\site-packages (from plotly) (1.43.0)
Requirement already satisfied: packaging in c:\users\win\appdata\local\programs\python\python312\lib\site-packages (from plotly) (23.2)
Note: you may need to restart the kernel to use updated packages.
[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
In [53]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'notebook'  # or 'iframe', 'colab', or 'browser' if needed


fig = px.bar(df, x='age', y='sex', color='survived')
fig.show()
In [61]:
df.plot()
plt.title("a general plot for the dataframe")
plt.show()
No description has been provided for this image
In [60]:
df.plot(kind='scatter',x='age',y='class')
plt.title("class vs age")
plt.xlabel("age")
plt.ylabel("class")
plt.show()
No description has been provided for this image
In [ ]: